import pytesseract, fitz
from pdf2image import convert_from_path

# Location of the Tesseract-OCR executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = r"D:\tool\Tesseract OCR\tesseract.exe"
# Directory holding the poppler binaries; passed to pdf2image when rendering pages.
poppler_path = r"D:\tool\poppler-24.08.0\Library\bin"

def extract_text_mixed(pdf_path, keyword="测试合同", lang="chi_sim"):
    """Scan a PDF page by page and report the pages whose text contains a keyword.

    For every page the embedded text layer is read first. If that yields no
    text, the page image rendered by pdf2image/poppler is run through
    Tesseract instead. ASCII spaces are removed from the page text (and from
    ``keyword``) before the substring check. For each matching page the page
    text and a notice line are printed.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Substring to search for in each page's text.
        lang: Tesseract language code passed to ``image_to_string`` for the
            OCR fallback.

    Returns:
        A list of the 1-based page numbers whose text contains ``keyword``.
    """
    # Page text has its spaces stripped below, so strip them from the search
    # term as well; otherwise a keyword containing a space could never match.
    needle = keyword.replace(" ", "")
    matched_pages = []
    # Rendered lazily: poppler is only run if some page lacks a text layer.
    # None (rather than an empty list) marks "not rendered yet" so an empty
    # render result does not trigger a re-render on every such page.
    page_images = None

    doc = fitz.open(pdf_path)
    try:
        for index, page in enumerate(doc):
            page_number = index + 1
            text = page.get_text("text").replace(" ", "")

            if not text.strip():  # no extractable text: fall back to OCR
                if page_images is None:
                    page_images = convert_from_path(pdf_path, poppler_path=poppler_path)
                print(f"第 {page_number} 页没有文字，尝试OCR")
                ocr_text = pytesseract.image_to_string(page_images[index], lang=lang)
                text = ocr_text.replace(" ", "")

            if needle in text:
                print(text)
                print(f"第 {page_number} 页包含关键字")
                matched_pages.append(page_number)
    finally:
        # Always release the document handle, even if extraction or OCR fails.
        doc.close()

    return matched_pages

# Module-level invocation: scans this hard-coded PDF whenever the file is run.
extract_text_mixed(pdf_path="../联合打印1.pdf")